{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "stopped-brush",
   "metadata": {},
   "source": [
    "# DESCRIPTIVE STATS FOR HATECHECK CASES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "vital-insert",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialise relevant packages\n",
    "\n",
    "# Basics\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import string\n",
    "\n",
    "# Lexical diversity\n",
    "from lexical_diversity import lex_div as ld #https://pypi.org/project/lexical-diversity/"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "popular-height",
   "metadata": {},
   "source": [
    "## Load Test Suite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "eight-opportunity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load test suite\n",
    "hatecheck_df = pd.read_csv('./Data/Test Suite/hatecheck_final_ACL.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "spatial-moral",
   "metadata": {},
   "source": [
    "## Calculate Average Test Case Length --> Characters and Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "hazardous-characteristic",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CASE LENGTH: CHARS\n",
      "count    3728.000000\n",
      "mean       48.263948\n",
      "std        16.875595\n",
      "min        11.000000\n",
      "25%        35.000000\n",
      "50%        46.000000\n",
      "75%        59.000000\n",
      "max       104.000000\n",
      "Name: test_case_len_chars, dtype: float64 \n",
      "\n",
      "CASE LENGTH: WORDS\n",
      "count    3728.000000\n",
      "mean        8.867758\n",
      "std         3.326351\n",
      "min         2.000000\n",
      "25%         6.000000\n",
      "50%         9.000000\n",
      "75%        11.000000\n",
      "max        20.000000\n",
      "Name: test_case_len_words, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "### CASE LENGTH: CHARS\n",
    "hatecheck_df[\"test_case_len_chars\"] = hatecheck_df.test_case.apply(lambda x: len(x))\n",
    "print(\"CASE LENGTH: CHARS\")\n",
    "print(hatecheck_df.test_case_len_chars.describe(), \"\\n\")\n",
    "\n",
    "### CASE LENGTH: WORDS\n",
    "def text_to_word_list(case):\n",
    "    case = case.translate(str.maketrans('', '', string.punctuation))\n",
    "    return case.split()\n",
    "\n",
    "hatecheck_df[\"test_case_words\"] = hatecheck_df.test_case.apply(lambda x: text_to_word_list(x))\n",
    "hatecheck_df[\"test_case_len_words\"] = hatecheck_df.test_case_words.apply(lambda x: len(x))\n",
    "print(\"CASE LENGTH: WORDS\")\n",
    "print(hatecheck_df.test_case_len_words.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "deadly-appreciation",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
